19 csv文件处理
CSV(Comma-Separated Values)是最常见的数据格式之一——Excel导出、数据库备份、日志导出都用它。Python的csv模块就是专门处理CSV的。
一、读取CSV
1.1 csv.reader()
python
import csv
# 读取CSV文件
with open("data.csv", "r", encoding="utf-8") as f:
reader = csv.reader(f)
for row in reader:
print(row)每行返回一个字符串列表:
['name', 'age', 'city']
['大志', '28', '北京']
['小明', '25', '上海']重要:打开文件时必须指定newline='':
python
import csv
with open("data.csv", "r", newline="", encoding="utf-8") as f:
reader = csv.reader(f)
for row in reader:
print(row)1.2 跳过表头
python
import csv
with open("data.csv", "r", newline="", encoding="utf-8") as f:
reader = csv.reader(f)
header = next(reader) # 跳过表头
print(f"列名: {header}")
for row in reader:
print(row)1.3 转换为列表
python
import csv
with open("data.csv", "r", newline="", encoding="utf-8") as f:
reader = csv.reader(f)
rows = list(reader)
print(rows)
# [['name', 'age', 'city'], ['大志', '28', '北京'], ['小明', '25', '上海']]二、写入CSV
2.1 csv.writer()
python
import csv
with open("output.csv", "w", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(["name", "age", "city"]) # 写一行
writer.writerow(["大志", 28, "北京"])
writer.writerows([ # 写多行
["小明", 25, "上海"],
["小红", 23, "广州"]
])2.2 自定义分隔符
python
import csv
# 使用制表符分隔
with open("data.tsv", "w", newline="", encoding="utf-8") as f:
writer = csv.writer(f, delimiter="\t")
writer.writerow(["name", "age", "city"])
writer.writerow(["大志", 28, "北京"])2.3 自定义引号
python
import csv
# 所有字段都加引号
with open("output.csv", "w", newline="", encoding="utf-8") as f:
writer = csv.writer(f, quoting=csv.QUOTE_ALL)
writer.writerow(["name", "age", "city"])
writer.writerow(["大志", 28, "北京"])引号常量:
| 常量 | 说明 |
|---|---|
QUOTE_ALL | 所有字段都加引号 |
QUOTE_MINIMAL | 只在需要时加引号(默认) |
QUOTE_NONNUMERIC | 非数字字段加引号 |
QUOTE_NONE | 不加引号 |
三、字典模式
3.1 DictReader
每行返回一个字典,键是表头。
python
import csv
with open("data.csv", "r", newline="", encoding="utf-8") as f:
reader = csv.DictReader(f)
for row in reader:
print(row)输出:
{'name': '大志', 'age': '28', 'city': '北京'}
{'name': '小明', 'age': '25', 'city': '上海'}访问更方便:
python
import csv
with open("data.csv", "r", newline="", encoding="utf-8") as f:
reader = csv.DictReader(f)
for row in reader:
print(f"{row['name']}, {row['age']}岁, 在{row['city']}")3.2 DictWriter
python
import csv
with open("output.csv", "w", newline="", encoding="utf-8") as f:
fieldnames = ["name", "age", "city"]
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader() # 写表头
writer.writerow({"name": "大志", "age": 28, "city": "北京"})
writer.writerows([
{"name": "小明", "age": 25, "city": "上海"},
{"name": "小红", "age": 23, "city": "广州"}
])四、Dialect方言
4.1 预定义方言
python
import csv
# excel方言(默认)
print(csv.list_dialects()) # ['excel', 'excel-tab', 'unix']
# 使用unix方言(用\n换行)
with open("data.csv", "w", newline="", encoding="utf-8") as f:
writer = csv.writer(f, dialect="unix")
writer.writerow(["name", "age"])
writer.writerow(["大志", 28])4.2 自定义方言
python
import csv
# 注册自定义方言
csv.register_dialect("custom",
delimiter="|",
quotechar="'",
quoting=csv.QUOTE_MINIMAL
)
with open("data.csv", "w", newline="", encoding="utf-8") as f:
writer = csv.writer(f, dialect="custom")
writer.writerow(["name", "age", "city"])
writer.writerow(["大志", 28, "北京"])五、Sniffer:自动检测格式
5.1 检测分隔符
python
import csv
# 自动检测CSV格式
with open("data.csv", "r", encoding="utf-8") as f:
sample = f.read(1024) # 读取样本
sniffer = csv.Sniffer()
dialect = sniffer.sniff(sample)
print(f"分隔符: {dialect.delimiter}")
print(f"引号符: {dialect.quotechar}")5.2 检测是否有表头
python
import csv
with open("data.csv", "r", encoding="utf-8") as f:
sample = f.read(1024)
sniffer = csv.Sniffer()
has_header = sniffer.has_header(sample)
print(f"有表头: {has_header}")六、处理特殊字符
6.1 包含逗号的字段
python
import csv
from io import StringIO
# 字段包含逗号时,自动加引号
output = StringIO()
writer = csv.writer(output)
writer.writerow(["name", "description"])
writer.writerow(["大志", "喜欢Python, LangChain"])
print(output.getvalue())
# name,description
# 大志,"喜欢Python, LangChain"6.2 包含换行的字段
python
import csv
from io import StringIO
output = StringIO()
writer = csv.writer(output)
writer.writerow(["name", "bio"])
writer.writerow(["大志", "第一行\n第二行"])
print(output.getvalue())
# name,bio
# 大志,"第一行
# 第二行"6.3 包含引号的字段
python
import csv
from io import StringIO
output = StringIO()
writer = csv.writer(output)
writer.writerow(["name", "quote"])
writer.writerow(["大志", '他说"Hello"'])
print(output.getvalue())
# name,quote
# 大志,"他说""Hello"""七、实战场景
7.1 数据转换
python
import csv
# CSV转JSON
import json
def csv_to_json(csv_file, json_file):
with open(csv_file, "r", newline="", encoding="utf-8") as f:
reader = csv.DictReader(f)
data = list(reader)
with open(json_file, "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=2)
csv_to_json("data.csv", "data.json")7.2 数据过滤
python
import csv
# 过滤CSV数据
def filter_csv(input_file, output_file, condition):
with open(input_file, "r", newline="", encoding="utf-8") as fin, \
open(output_file, "w", newline="", encoding="utf-8") as fout:
reader = csv.DictReader(fin)
writer = csv.DictWriter(fout, fieldnames=reader.fieldnames)
writer.writeheader()
for row in reader:
if condition(row):
writer.writerow(row)
# 过滤年龄大于25的记录
filter_csv("data.csv", "filtered.csv", lambda r: int(r["age"]) > 25)7.3 合并CSV文件
python
import csv
from pathlib import Path
def merge_csv_files(input_dir, output_file):
files = list(Path(input_dir).glob("*.csv"))
with open(output_file, "w", newline="", encoding="utf-8") as fout:
writer = None
for filepath in files:
with open(filepath, "r", newline="", encoding="utf-8") as fin:
reader = csv.DictReader(fin)
if writer is None:
writer = csv.DictWriter(fout, fieldnames=reader.fieldnames)
writer.writeheader()
for row in reader:
writer.writerow(row)
merge_csv_files("./data", "merged.csv")八、总结
csv模块的核心:
| 类/函数 | 用途 |
|---|---|
csv.reader() | 读取CSV,返回列表 |
csv.writer() | 写入CSV |
csv.DictReader() | 读取CSV,返回字典 |
csv.DictWriter() | 写入CSV,接收字典 |
csv.Sniffer() | 自动检测CSV格式 |
使用要点:
- 打开文件时加
newline='' - 指定
encoding='utf-8' - 包含逗号/换行的字段会自动加引号
DictReader/DictWriter更方便访问字段
CSV虽然简单,但用得非常多。掌握csv模块,数据导入导出就轻松了。